library(rstudioapi)
library(stringr)
library(pivottabler)
library(data.table)
library(plotly)
library(pheatmap)
library(viridis)
library(knitr)
app_dir <- str_split(rstudioapi::getActiveDocumentContext()$path, 'icb_data_exploration_and_analysis')[[1]][1]
app_dir <- paste0(app_dir, 'icb_data_exploration_and_analysis')
setwd(app_dir)
dataset_names <- list.dirs(file.path(app_dir, '.local_data'), full.names = FALSE, recursive = FALSE)
study_colnames <- list()
for(study in dataset_names){
study_df <- read.csv(file.path(app_dir, '.local_data', study, paste0(study, '_metadata.tsv')), sep='\t')
study_colnames[[study]] <- colnames(study_df)
}
common_colnames <- Reduce(intersect, study_colnames)
common_colnames <- c(common_colnames, c('Treatment.regimen', 'Drug.targets', 'Metastasis.status', 'Monotherapy', 'Combination'))
# common_colnames <- common_colnames[!common_colnames %in% c("TMB_raw", "nsTMB_raw", "indel_TMB_raw", "indel_nsTMB_raw", "TMB_perMb", "nsTMB_perMb", "indel_TMB_perMb", "indel_nsTMB_perMb", "CIN", "CNA_tot", "AMP", "DEL")]
# df_merged <- data.frame(matrix(NA, 0, length(c('study', common_colnames)), dimnames=list(c(), c('study', common_colnames))), stringsAsFactors=F)
df_merged <- read.csv(file.path(app_dir, 'Data', 'all_metadata.tsv'), sep='\t')
total_rows <- 0
df_metadata <- data.frame(matrix(NA, length(common_colnames), length(dataset_names), dimnames=list(common_colnames, dataset_names)), stringsAsFactors=F)
df_num_genes <- data.frame(matrix(NA, length(dataset_names), 2, dimnames=list(dataset_names, c('num_patients', 'num_genes'))), stringsAsFactors=F)
all.features <- list()
for(study in dataset_names){
study_df <- df_merged[df_merged$study == study, ]
study_df <- cbind(study = study, study_df)
expr_df <- NA
if(file.exists(file.path(app_dir, '.local_data', study, paste0(study, '_expr.tsv')))){
expr_df <- read.csv(file.path(app_dir, '.local_data', study, paste0(study, '_expr.tsv')), sep='\t')
}else{
expr_df <- read.csv(file.path(app_dir, '.local_data', study, paste0(study, '_expr_gene_tpm.tsv')), sep='\t')
}
all.features[[study]] <- rownames(expr_df)
df_metadata[, study] <- unlist(lapply(common_colnames, function(col_name){
sum(!is.na(study_df[, col_name]))
}))
df_num_genes[study, 'num_patients'] <- ncol(expr_df)
df_num_genes[ study, 'num_genes'] = nrow(expr_df)
}
Number of available genes in each dataset
- Studies with low number of genes are excluded: ICB_Hwang, ICB_Jerby_Arnon and ICB_Roh.
df_num_genes <- df_num_genes[order(df_num_genes$num_genes, decreasing = TRUE), ]
kable(df_num_genes, caption='')
| ICB_Gide |
41 |
61544 |
| ICB_Hugo |
27 |
61544 |
| ICB_Jung |
27 |
61544 |
| ICB_Kim |
45 |
61544 |
| ICB_Riaz |
46 |
61544 |
| ICB_Miao1 |
33 |
57820 |
| ICB_Van_Allen |
42 |
57820 |
| ICB_Nathanson |
24 |
52230 |
| ICB_Braun |
181 |
40994 |
| ICB_Snyder |
25 |
34906 |
| ICB_VanDenEnde |
35 |
31031 |
| ICB_Mariathasan |
348 |
23005 |
| ICB_Padron |
45 |
21366 |
| ICB_Liu |
121 |
20212 |
| ICB_Puch |
55 |
18789 |
| ICB_Shiuan |
15 |
17235 |
| ICB_Roh |
16 |
721 |
| ICB_Jerby_Arnon |
112 |
587 |
| ICB_Hwang |
21 |
384 |
Common genes across datasets
- Pair-wise comparison between the genes that are common across two datasets.
- Identified which datasets have the most common, or least common number of genes(features).
- The number of genes are in log scale for simplified visualization.
excluded_datasets <- c('ICB_Hwang', 'ICB_Jerby_Arnon', 'ICB_Roh')
dataset_names_common_genes <- dataset_names[!dataset_names %in% excluded_datasets]
temp.features <- all.features[names(all.features)[!names(all.features) %in% excluded_datasets]]
lapply(temp.features,head) # everything looks good!
## $ICB_Braun
## [1] "ENSG00000000003.10" "ENSG00000000005.5" "ENSG00000000419.8"
## [4] "ENSG00000000457.9" "ENSG00000000460.12" "ENSG00000000938.8"
##
## $ICB_Gide
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Hugo
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Jung
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Kim
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Liu
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Mariathasan
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Miao1
## [1] "ENSG00000000003.10" "ENSG00000000005.5" "ENSG00000000419.8"
## [4] "ENSG00000000457.9" "ENSG00000000460.12" "ENSG00000000938.8"
##
## $ICB_Nathanson
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Padron
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Puch
## [1] "ENSG00000000003.10" "ENSG00000000005.5" "ENSG00000000419.8"
## [4] "ENSG00000000457.9" "ENSG00000000460.12" "ENSG00000000938.8"
##
## $ICB_Riaz
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Shiuan
## [1] "ENSG00000000003.15" "ENSG00000000005.6" "ENSG00000000419.14"
## [4] "ENSG00000000457.14" "ENSG00000000460.17" "ENSG00000000938.13"
##
## $ICB_Snyder
## [1] "ENSG00000000003.10" "ENSG00000000005.5" "ENSG00000000419.8"
## [4] "ENSG00000000457.9" "ENSG00000000460.12" "ENSG00000000938.8"
##
## $ICB_Van_Allen
## [1] "ENSG00000000003.10" "ENSG00000000005.5" "ENSG00000000419.8"
## [4] "ENSG00000000457.9" "ENSG00000000460.12" "ENSG00000000938.8"
##
## $ICB_VanDenEnde
## [1] "ENSG00000003056.3" "ENSG00000005073.5" "ENSG00000006116.3"
## [4] "ENSG00000008197.4" "ENSG00000019995.6" "ENSG00000039139.9"
temp <- unlist(temp.features)
temp <- unique(temp) # this is the 93819 features Jonas reported
num_datasets <- length(dataset_names_common_genes)
i=1
for (i in 2:num_datasets) {
temp <- base::intersect(temp,temp.features[[i]])
}
temp.table <- matrix(nrow=num_datasets, ncol=num_datasets)
for (i in 1:num_datasets) {
for (j in 1:num_datasets) {
temp.table[i,j] <- length(base::intersect(temp.features[[i]],temp.features[[j]]))
}
}
colnames(temp.table) <- names(temp.features)
rownames(temp.table) <- names(temp.features)
range(temp.table) # 160 61544
## [1] 160 61544
pheatmap(log10(temp.table), color = viridis(30))

datasets_to_remove <- c('ICB_Puch', 'ICB_Shiuan', 'ICB_Liu', 'ICB_Padron', 'ICB_Mariathasan')
# datasets_to_remove <- c()
available_datasets <- df_merged[!df_merged$study %in% excluded_datasets, ]
all_patients <- length(rownames(available_datasets))
all_genes <- range(temp.table)[2]
available_datasets <- available_datasets[!available_datasets$study %in% datasets_to_remove, ]
available_patients <- length(rownames(available_datasets))
overlapping_genes <- temp.table[!rownames(temp.table) %in% datasets_to_remove, !colnames(temp.table) %in% datasets_to_remove]
available_genes <- range(overlapping_genes)[1]
# paste0('total genes ', all_genes)
# paste0('total patients ', all_patients)
# paste0('available patients ', available_patients)
# paste0('available genes ', available_genes)
colors <- c('#FF7F0E', '#c6c6c6')
available_patients_pct <- (available_patients/all_patients) * 100
patients <- data.frame(
availability=c('available', 'unavailable'),
percentage=c(available_patients_pct, 100 - available_patients_pct)
)
pie_patients <- plot_ly(
patients,
labels = ~availability,
values = ~percentage,
marker = list(colors = colors),
type = 'pie'
)
pie_patients <- pie_patients %>% layout(title = 'Available Patients',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
available_genes_pct <- (available_genes/all_genes) * 100
genes <- data.frame(
availability=c('available', 'unavailable'),
percentage=c(available_genes_pct, 100 - available_genes_pct)
)
pie_genes <- plot_ly(
genes,
labels = ~availability,
values = ~percentage,
marker = list(colors = colors),
type = 'pie'
)
pie_genes <- pie_genes %>% layout(title = 'Available Genes',
xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))
pie_patients
pie_genes